R code for lecture 29

# Example of reparameterizing a model to yield cell means
 
quinn <- read.csv('ecol 563/quinn1.csv')
quinn$Density <- factor(quinn$Density)
# effects model
out1 <- lm(Eggs~Density*Season, data=quinn)
summary(out1)
# cell means model
out2 <- lm(Eggs~Density:Season-1, data=quinn)
# models have same AIC
AIC(out1, out2)
# cell means
summary(out2)
# confidence intervals for means
confint(out2)
 
# binary data example
parasites <- read.table('ecol 563/Parasite.txt',header=T)
parasites[1:8,]
# fit model with all predictors
out1 <- glm(infection~age+weight+sex, data=parasites, family=binomial)
summary(out1)
# Wald test says age is not significant, compare with LR test
out2 <- glm(infection~weight+sex, data=parasites, family=binomial)
anova(out2,out1,test='Chisq')
 
# odds ratios from model
exp(coef(out1)[2:4])
 
 
# default plot of response is a mosaic plot
plot(infection~weight, data=parasites)
# plot binary data
plot(as.numeric(infection)-1~weight, data=parasites)
# jitter the data and color the observations by sex
plot(jitter(as.numeric(infection))-1 ~ jitter(weight), data=parasites, col=as.numeric(sex))
# decrease the jitter
plot(jitter(as.numeric(infection), a=.05)-1 ~ jitter(weight, a=.1), data=parasites, col=as.numeric(sex), xlab='weight', ylab='Probability')
# model with weight and sex
coef(out2)
# linear predictor
eta <- function(x,sex) coef(out2)[1] + coef(out2)[2]*x + coef(out2)[3]*(sex=='male')
eta(1,'female')
eta(1,'male')
# inverse logit
inv.logit <- function(eta) exp(eta)/(1+exp(eta))
 
# add predicted probabilities to graph
plot(jitter(as.numeric(infection),a=.05)-1 ~ jitter(weight,a=.1), data=parasites, col=as.numeric(sex), xlab='weight', ylab='Probability')
curve(inv.logit(eta(x,'female')), add=T, lty=2)
curve(inv.logit(eta(x,'male')), add=T, lty=2, col=2)
 
# assess functional forms of predictors with a GAM
library(mgcv)
out1.gam <- gam(infection ~ sex + s(weight)+s(age), data=parasites, family=binomial)
 
# plot estimated smmooths
plot(out1.gam)
 
# plot smooth for age
plot(out1.gam, select=2)
abline(h=0, lty=2, col=2)
# plot smooth for weight
plot(out1.gam, select=1)
abline(h=0, lty=2, col=2)
 
# add a quadratic term in age
out3 <- glm(formula = infection ~ age +I(age^2) + weight + sex, family = binomial, data=parasites)
# LR test for quadatic age
anova(out1, out3, test='Chisq')
summary(out3)
 
# re-examine smooth for weight with model containing quadratic age
out1.gam <- gam(infection ~ sex + s(weight) + age + I(age^2), data=parasites, family=binomial)
plot(out1.gam)
 
# fit breakpoint model for weight with a break point of 8
out4 <- glm(formula = infection ~ age + I(age^2) + sex + I((weight>8)*(weight-8)), family = binomial, data=parasites)
# this is an improvement
AIC(out3, out4)
summary(out4)
 
# possible choices for break points
sort(unique(parasites$weight))
cvec <- 3:16
 
# function to fit model at specific break points
c.func <- function(c) {
mymodel <- glm(infection ~ sex + age + I(age^2) + I((weight>=c)*(weight-c)), data=parasites, family=binomial)
c(c, logLik(mymodel), AIC(mymodel))}
 
# run function
sapply(cvec, c.func)
 
# best break point is c = 12
out6 <- glm(infection ~ sex + age + I(age^2) + I((weight>=12)*(weight-12)), data=parasites, family=binomial)
summary(out6)
 
# compare models
AIC(out1, out3, out6)
 
# comparison is unfair because we're not counting break point as a parameter
my.aic <- AIC(out1, out3, out6)
# add to AIC
my.aic[,"AIC"] <- my.aic[,"AIC"]+c(0,0,2)
# add parameter
my.aic[,"df"] <- my.aic[,"df"]+c(0,0,1)
 
#### model calibration ###########
 
# predicted probabilities
round(fitted(out6),3)
# decision rule: classify as infected if p > 0.5
# confusion matrix
tab0 <- table(fitted(out6)>0.5, parasites$infection)
tab0
# calculate specificity and sensitivity
diag(tab0)/apply(tab0,2,sum)
 
# function to fit breakpoint model
precision <- function(c,model, response) {
tab0 <- table(fitted(model)>=c, response)
rownames(tab0)<-c('-','+')
out <- diag(tab0)/apply(tab0, 2, sum)
names(out) <- c('specificity', 'sensitivity')
list(tab0, out)
}
precision(.5, out6, parasites$infection)
 
# obtaining specificity and sensitivity
library(ROCR)
# create prediction object
pred1 <- prediction(fitted(out6), parasites$infection)
# extract true positive and true negative rates
stats1 <- performance(pred1, 'tpr', 'tnr')
# the result is an S4 object
str(stats1)
stats1@y.values
stats1@x.values
stats1@alpha.values
# change infinite value to 1
stats1@alpha.values[[1]][1]<-1
 
# plot specificity
plot(stats1@alpha.values[[1]], stats1@x.values[[1]], type='s', xlab='cut-off', ylab='Classification rate')
# add sensitivity
lines(stats1@alpha.values[[1]], stats1@y.values[[1]], type='s', col=2)
legend('right', c('specificity', 'sensitivity'), col=1:2, lty=1, bty='n', cex=.9)
 
# determinine minimized difference threshold (MDT)
which.min(abs(stats1@x.values[[1]]-stats1@y.values[[1]]))
stats1@alpha.values[[1]][26]
precision(stats1@alpha.values[[1]][26], out6,parasites$infection)
abline(v=stats1@alpha.values[[1]][26], lty=2, col=4)
 
# determinine maximized sum threshold (MST)
which.max(stats1@x.values[[1]]+stats1@y.values[[1]])
stats1@alpha.values[[1]][37]
abline(v=stats1@alpha.values[[1]][37], col=3, lty=2)
 
############ ROC curves ##########
 
stats1a <- performance(pred1,'tpr','fpr')
str(stats1a)
plot(stats1a@x.values[[1]], stats1a@y.values[[1]], type='s', col='grey70', lwd=2, xlab='FPR', ylab='TPR')
 
# obtain TPR and FPR of a second model
pred2 <- prediction(fitted(out1), parasites$infection)
stats2a <- performance(pred2,'tpr','fpr')
# add ROC curve to previous graph
lines(stats2a@x.values[[1]], stats2a@y.values[[1]], type='s', col=2)
 
##### Area under the curve ######
stats1b <- performance(pred1, 'auc')
stats2b <- performance(pred2, 'auc')
str(stats1b)
stats1b@y.values
stats2b@y.values
 
#### 10-fold cross-validation of binary model ######
library(DAAG)
cv.binary(out6)
cv.binary(out1)
R Code for Lecture 29 (Wednesday, December 5, 2012)